{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://www.kaggle.com/kernels/scriptcontent/24052407/download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):\n",
    "    \"\"\"Helper routine for 'find_bad_zeros'.\n",
    "    \n",
    "    This operates upon a single dataframe produced by 'groupby'. We expect an \n",
    "    additional column 'meter_id' which is a duplicate of 'meter' because groupby \n",
    "    eliminates the original one.\"\"\"\n",
    "    meter = Xy_subset.meter_id.iloc[0]\n",
    "    is_zero = Xy_subset.meter_reading == 0\n",
    "    if meter == 0:\n",
    "        # Electrical meters should never be zero. Keep all zero-readings in this table so that\n",
    "        # they will all be dropped in the train set.\n",
    "        return is_zero\n",
    "\n",
    "    transitions = (is_zero != is_zero.shift(1))\n",
    "    all_sequence_ids = transitions.cumsum()\n",
    "    ids = all_sequence_ids[is_zero].rename(\"ids\")\n",
    "    if meter in [2, 3]:\n",
    "        # It's normal for steam and hotwater to be turned off during the summer\n",
    "        keep = set(ids[(Xy_subset.timestamp < summer_start) |\n",
    "                       (Xy_subset.timestamp > summer_end)].unique())\n",
    "        is_bad = ids.isin(keep) & (ids.map(ids.value_counts()) >= min_interval)\n",
    "    elif meter == 1:\n",
    "        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index(\"timestamp\").ids\n",
    "        is_bad = ids.map(ids.value_counts()) >= min_interval\n",
    "\n",
    "        # Cold water may be turned off during the winter\n",
    "        jan_id = time_ids.get(0, False)\n",
    "        dec_id = time_ids.get(8283, False)\n",
    "        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and\n",
    "                dec_id == time_ids.get(8783, False)):\n",
    "            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))\n",
    "    else:\n",
    "        raise Exception(f\"Unexpected meter type: {meter}\")\n",
    "\n",
    "    result = is_zero.copy()\n",
    "    result.update(is_bad)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find_bad_zeros identifies rows with zero-readings that have the following characteristics:\n",
    "# all electrical readings with zero values.\n",
    "# 48+ hour runs of steam and hotwater zero-readings except for those \n",
    "# which are entirely contained within what we identify as typical \"core summer months\".\n",
    "# 48+ hour runs of chilledwater zero-readings except for those \n",
    "# which occur simultaneously at the start and end of the year (i.e. \"core winter months\").\n",
    "\n",
    "def find_bad_zeros(X, y):\n",
    "    \"\"\"Returns an Index object containing only the rows which should be deleted.\"\"\"\n",
    "    Xy = X.assign(meter_reading=y, meter_id=X.meter)\n",
    "    is_bad_zero = Xy.groupby([\"building_id\", \"meter\"]).apply(make_is_bad_zero)\n",
    "    return is_bad_zero[is_bad_zero].index.droplevel([0, 1])\n",
    "def find_bad_rows(X, y):\n",
    "    return find_bad_zeros(X, y).union(find_bad_sitezero(X)).union(find_bad_building1099(X, y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find_bad_building1099 identifies the most absurdly high readings from building 1099. \n",
    "# These are orders of magnitude higher than all data, and have been emperically seen \n",
    "# in LB probes to be harmful outliers.\n",
    "def find_bad_building1099(X, y):\n",
    "    \"\"\"Returns indices of bad rows (with absurdly high readings) from building 1099.\"\"\"\n",
    "    return X[(X.building_id == 1099) & (X.meter == 2) & (y > 3e4)].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find_bad_sitezero identifies the \"known-bad\" \n",
    "# electrical readings from the first 141 days of the data for site 0 (i.e. UCF).\n",
    "def find_bad_sitezero(X):\n",
    "    \"\"\"Returns indices of bad rows from the early days of Site 0 (UCF).\"\"\"\n",
    "    return X[(X.timestamp < 3378) & (X.site_id == 0) & (X.meter == 0)].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compress_dataframe(df):\n",
    "    result = df.copy()\n",
    "    for col in result.columns:\n",
    "        col_data = result[col]\n",
    "        dn = col_data.dtype.name\n",
    "        if dn == \"object\":\n",
    "            result[col] = pd.to_numeric(col_data.astype(\"category\").cat.codes, downcast=\"integer\")\n",
    "        elif dn == \"bool\":\n",
    "            result[col] = col_data.astype(\"int8\")\n",
    "        elif dn.startswith(\"int\") or (col_data.round() == col_data).all():\n",
    "            result[col] = pd.to_numeric(col_data, downcast=\"integer\")\n",
    "        else:\n",
    "            result[col] = pd.to_numeric(col_data, downcast='float')\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = compress_dataframe(pd.read_csv('../../Resources/train.csv')\\\n",
    "                              .merge(pd.read_csv('../../Resources/building_metadata.csv')),\n",
    "                              on = 'building_id', how = 'left')\n",
    "bad_rows = find_bad_rows(df_train, df_train['meter_reading'])\n",
    "pd.Series(bad_rows.sort_values()).to_csv(\"../../Large_output/rows_to_drop.csv\", header=False, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
